library(tidyverse)
library(glmnet)
library(caret)
library(pROC)
library(VIM)
#library(performanceEstimation)
#library(mlr)
#library(UBL)
# Load the 2017 NY HMDA loan records and drop identifier / redundant
# columns by position, keeping only the predictors used below.
hmda_path <- "/Users/yctang/Documents/Columbia/5291 Advanced data Analysis/project/data/hmda_2017_ny_all-records_labels.csv"
dat <- read.csv(hmda_path)
# Column positions to discard; duplicates inside the ranges (e.g. 69:71
# already inside 56:71) are harmless for negative indexing.
drop_cols <- c(1:4, 6, 8, 10, 12, 15, 17, 19:27, 29, 31, 33:50, 51, 53, 54,
               56:71, 69:71, 78)
dat <- dat[, -drop_cols]
#aggr(dat)

Data Pre-processing

# Drop rows with any missing value (keeps the recoding below NA-free).
dat <- drop_na(dat)

# Re-encode the HMDA categorical codes.
# FIX: use vectorized `dat$col[dat$col %in% ...] <- v` instead of the
# original `dat[cond, ]$col <- v` pattern, which copies a whole data-frame
# slice per assignment and misbehaves if the condition ever contains NA.
# owner_occupancy: 1 = owner-occupied principal dwelling, 0 = other
dat$owner_occupancy[dat$owner_occupancy %in% c(2, 3)] <- 0
# loan_type: 1 = conventional, 0 = other (FHA / VA / FSA-RHS)
dat$loan_type[dat$loan_type %in% c(2, 3, 4)] <- 0
# preapproval: 1 = requested, 0 = not requested / not applicable
dat$preapproval[dat$preapproval %in% c(2, 3)] <- 0
# action_taken: keep originated (1), approved-not-accepted (2), denied (3);
# then 1 = approved (codes 1-2), 0 = denied (code 3)
dat <- dat %>% filter(action_taken %in% c(1, 2, 3))
dat$action_taken[dat$action_taken == 2] <- 1
dat$action_taken[dat$action_taken == 3] <- 0
# applicant_ethnicity: 1 = Hispanic/Latino, 0 = other
dat$applicant_ethnicity[dat$applicant_ethnicity %in% c(2, 3, 4)] <- 0
# sex: 0 = unknown, 1 = male, 2 = female
dat$applicant_sex[dat$applicant_sex %in% c(3, 4)] <- 0
# race: 0 = other, 1 = Asian, 2 = Black or African American, 3 = White
# NOTE: order matters — the "other" codes (1, 4, 6, 7) must be collapsed
# to 0 first so the 2->1, 3->2, 5->3 remaps cannot collide.
dat$applicant_race_1[dat$applicant_race_1 %in% c(1, 4, 6, 7)] <- 0
dat$applicant_race_1[dat$applicant_race_1 == 2] <- 1
dat$applicant_race_1[dat$applicant_race_1 == 3] <- 2
dat$applicant_race_1[dat$applicant_race_1 == 5] <- 3
names(dat)[names(dat) == "applicant_race_1"] <- "applicant_race"
# co-applicant indicator: 1 = has a co-applicant (codes 1-2),
# 0 = none / no information (codes 3-5)
dat$co_applicant_ethnicity[dat$co_applicant_ethnicity == 2] <- 1
dat$co_applicant_ethnicity[dat$co_applicant_ethnicity %in% c(3, 4, 5)] <- 0
names(dat)[names(dat) == "co_applicant_ethnicity"] <- "co_applicant"

# Convert the coded categorical columns to factors so downstream models
# (glm, randomForest, FAMD, model.matrix) treat them as categories, not
# numbers.
categorical_cols <- c(
  "agency_code", "loan_type", "property_type", "loan_purpose",
  "owner_occupancy", "preapproval", "action_taken", "applicant_ethnicity",
  "co_applicant", "applicant_race", "applicant_sex"
)
dat[categorical_cols] <- lapply(dat[categorical_cols], as.factor)

Data Visualization

# Class balance of the response.
# FIX: geom_bar() plots raw counts, so applying scales::percent to the
# count axis mislabeled it (a count of 100 rendered as "10000%"). Convert
# counts to proportions with after_stat() so the percent labels are
# actually correct (this is what the commented-out ..count.. version
# below was trying to do).
dat %>%
  ggplot(aes(action_taken, fill = action_taken)) +
  geom_bar(aes(y = after_stat(count) / sum(after_stat(count)))) +
  scale_y_continuous(labels = scales::percent) +
  labs(y = "proportion")

#ggplot(dat, aes(x = action_taken)) + 
#    geom_bar(aes(y = (..count..)/sum(..count..))) + 
#    scale_y_continuous(formatter = 'percent')

Training-testing Set Split

# Reproducible 70/30 train/test split on the raw (pre-standardization)
# data. sample() truncates a fractional size, so floor() is explicit here.
set.seed(123456)
n_obs <- nrow(dat)
indices.old <- sample(seq_len(n_obs), floor(n_obs * 0.7))
training.old <- dat[indices.old, ]
testing.old <- dat[-indices.old, ]

First Try: Logistic Regression

# Baseline logistic regression on the unbalanced, unscaled data.
full <- glm(action_taken ~ ., data = training.old,
            family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(full)
# Predicted P(approved); classify with the usual 0.5 cutoff.
test.prob <- predict(full, newdata = testing.old, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred),
                reference = testing.old$action_taken, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0  1803  1485
##          1 17168 69700
##                                           
##                Accuracy : 0.7931          
##                  95% CI : (0.7904, 0.7957)
##     No Information Rate : 0.7896          
##     P-Value [Acc > NIR] : 0.004678        
##                                           
##                   Kappa : 0.1065          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.97914         
##             Specificity : 0.09504         
##          Pos Pred Value : 0.80237         
##          Neg Pred Value : 0.54836         
##              Prevalence : 0.78958         
##          Detection Rate : 0.77310         
##    Detection Prevalence : 0.96353         
##       Balanced Accuracy : 0.53709         
##                                           
##        'Positive' Class : 1               
## 
test.roc <- roc(testing.old$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Outliers, standardization and other adjustments

# Outliers & standardization.
# Raw income (in $000s) is heavily right-skewed; inspect before trimming.
hist(dat$applicant_income_000s)

# Compare the distribution shapes of tract population and applicant income
# below their 99th percentiles (full range is unreadable due to outliers).
plot(density(dat$population[dat$population < quantile(dat$population, 0.99)]))
lines(density(dat$applicant_income_000s[dat$applicant_income_000s < quantile(dat$applicant_income_000s, 0.99)]))

# Trim the top 4% of income and of loan amount — NOTE the filters are
# sequential, so the loan-amount quantile is computed on the already
# income-trimmed data. Then rescale the continuous predictors.
# NOTE: scale(x, center = FALSE) divides by the root-mean-square of x,
# not the standard deviation; percentages are mapped to [0, 1].
# Finally drop the raw columns and move the response to column 1
# (later code relies on action_taken being the first column).
dat <- dat %>%
  filter(applicant_income_000s < quantile(applicant_income_000s, 0.96)) %>%
  filter(loan_amount_000s < quantile(loan_amount_000s, 0.96)) %>%
  mutate(applicant_income = as.numeric(scale(applicant_income_000s, center = FALSE)), 
         loan_amount = as.numeric(scale(loan_amount_000s, center = FALSE)), 
         population = as.numeric(scale(population, center = FALSE)), 
         hud_median_family_income = as.numeric(scale(hud_median_family_income, center = FALSE)),
         number_of_owner_occupied_units = as.numeric(scale(number_of_owner_occupied_units, center = FALSE)),
         number_of_1_to_4_family_units = as.numeric(scale(number_of_1_to_4_family_units, center = FALSE)),
         minority_population = minority_population / 100,
         tract_to_msamd_income = tract_to_msamd_income / 100
         ) %>%
  dplyr::select(-c(applicant_income_000s, loan_amount_000s)) %>%
  dplyr::select(action_taken, everything())

# Re-inspect the rescaled income distribution.
hist(dat$applicant_income)

plot(density(dat$population))
lines(density(dat$applicant_income))

# Same comparison as above, after trimming and rescaling.
plot(density(dat$population[dat$population<quantile(dat$population, 0.99)]))
lines(density(dat$applicant_income[dat$applicant_income<quantile(dat$applicant_income, 0.99)]))

Second Try: Logistic Regression

# The later methods (SMOTE, FAMD, neuralnet) are slow on the full data, so
# subsample 1000 rows, re-split 70/30, and refit the baseline logistic
# regression on the cleaned/standardized predictors.
set.seed(123456)
dat <- sample_n(dat, 1000)
n_obs <- nrow(dat)
indices <- sample(seq_len(n_obs), floor(n_obs * 0.7))
training <- dat[indices, ]
testing <- dat[-indices, ]
full <- glm(action_taken ~ ., data = training,
            family = binomial(link = "logit"))
#summary(full)
test.prob <- predict(full, newdata = testing, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred),
                reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   8  16
##          1  52 224
##                                           
##                Accuracy : 0.7733          
##                  95% CI : (0.7217, 0.8195)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.8888          
##                                           
##                   Kappa : 0.086           
##                                           
##  Mcnemar's Test P-Value : 2.192e-05       
##                                           
##             Sensitivity : 0.9333          
##             Specificity : 0.1333          
##          Pos Pred Value : 0.8116          
##          Neg Pred Value : 0.3333          
##              Prevalence : 0.8000          
##          Detection Rate : 0.7467          
##    Detection Prevalence : 0.9200          
##       Balanced Accuracy : 0.5333          
##                                           
##        'Positive' Class : 1               
## 
test.roc <- roc(testing$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

SMOTE & L1 Regularization

# install the RSBID package
#install.packages("devtools")
#devtools::install_github("dongyuanwu/RSBID")
library(RSBID)
## Loading required package: FNN
## Loading required package: clustMixType
## Loading required package: klaR
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#training.dummy$action_taken1 <- as.factor(training.dummy$action_taken1)
#testing.dummy$action_taken1 <- as.factor(testing.dummy$action_taken1)

# too slow! sample 50000 for testing
#set.seed(123456)
#training <- sample_n(training, 50000)

# Oversample the minority class with SMOTE-NC, which handles mixed
# continuous/categorical predictors; the call is timed because it is slow.
# NOTE: SMOTE_NC draws random synthetic points; no seed is set here.
t0 <- proc.time()
training.bal <- SMOTE_NC(training, "action_taken")
## Variables are continous and categorical, SMOTE_NC could be used.
proc.time() - t0 # running time
##    user  system elapsed 
##  11.882   0.088  12.167
# One-hot encode both sets (dropping the intercept column) for glmnet and
# neuralnet; the response becomes the dummy column `action_taken1`.
training.bal.dummy <- data.frame(model.matrix(~ ., training.bal)[, -1])
testing.dummy <- data.frame(model.matrix(~ ., testing)[, -1])
X <- as.matrix(training.bal.dummy[-1])
Y <- training.bal.dummy$action_taken1
# LASSO-penalized logistic regression: choose lambda by cross-validation,
# then refit at lambda.min.
# NOTE(review): cv.glmnet's fold assignment is random and unseeded here,
# so lambda.min is not reproducible.
cv <- cv.glmnet(X, Y, family = "binomial")
fit.L1 <- glmnet(X, Y, family = "binomial", alpha = 1, lambda = cv$lambda.min)
# BUG FIX: predict.glmnet returns the linear predictor (log-odds) by
# default, so thresholding at 0.5 was NOT a 0.5-probability cutoff
# (it corresponded to p ≈ 0.62). Request type = "response" so the 0.5
# threshold applies to probabilities. (The recorded output below was
# produced with link-scale scores and will differ.)
test.prob <- fit.L1 %>% predict(newx = as.matrix(testing.dummy[-1]), type = "response")
test.pred <- as.numeric(ifelse(test.prob > 0.5, 1, 0))
mean(test.pred == testing.dummy$action_taken1)
## [1] 0.5766667
confusionMatrix(data = as.factor(test.pred), reference = factor(testing.dummy$action_taken1), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  40 107
##          1  20 133
##                                           
##                Accuracy : 0.5767          
##                  95% CI : (0.5186, 0.6332)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.143           
##                                           
##  Mcnemar's Test P-Value : 2.325e-14       
##                                           
##             Sensitivity : 0.5542          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.8693          
##          Neg Pred Value : 0.2721          
##              Prevalence : 0.8000          
##          Detection Rate : 0.4433          
##    Detection Prevalence : 0.5100          
##       Balanced Accuracy : 0.6104          
##                                           
##        'Positive' Class : 1               
## 
test.roc <- roc(testing.dummy$action_taken1 ~ as.numeric(test.prob), plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Miscellaneous exploratory attempts (translated from Chinese: "messy random tries")

Try: Neural Network

library(neuralnet)
# Feed-forward network on the one-hot encoded balanced data: one hidden
# layer of 5 units, cross-entropy error, sigmoid output
# (linear.output = FALSE). Training is slow, so time it.
t0 <- proc.time()
set.seed(123456)
NN <- neuralnet(action_taken1 ~ ., training.bal.dummy, hidden = 5,
                linear.output = FALSE, err.fct = "ce", stepmax = 1e7)
proc.time() - t0 # running time
##    user  system elapsed 
##  85.252   2.654  89.757
plot(NN)
# compute() returns the predicted probabilities in $net.result
# (an n x 1 matrix); threshold at 0.5.
predict_NN <- compute(NN, testing.dummy[-1])
test.pred <- as.numeric(predict_NN$net.result > 0.5)
mean(test.pred == testing.dummy$action_taken1)
## [1] 0.6166667
confusionMatrix(data = as.factor(test.pred), reference = factor(testing.dummy$action_taken1), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  25  80
##          1  35 160
##                                         
##                Accuracy : 0.6167        
##                  95% CI : (0.559, 0.672)
##     No Information Rate : 0.8           
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.065         
##                                         
##  Mcnemar's Test P-Value : 4.078e-05     
##                                         
##             Sensitivity : 0.6667        
##             Specificity : 0.4167        
##          Pos Pred Value : 0.8205        
##          Neg Pred Value : 0.2381        
##              Prevalence : 0.8000        
##          Detection Rate : 0.5333        
##    Detection Prevalence : 0.6500        
##       Balanced Accuracy : 0.5417        
##                                         
##        'Positive' Class : 1             
## 
test.roc <- roc(testing.dummy$action_taken1 ~ predict_NN$net.result, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Warning in roc.default(response, predictors[, 1], ...): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases

PCR (Principal Components Regression) — abandoned attempt, code left commented out below

#library(pls)
#use model to make predictions on a test set
#pcr_model <- pcr(action_taken1 ~ ., data = training.bal.dummy, validation = "CV")
#summary(pcr_model)
#pcr_pred <- predict(pcr_model, as.matrix(testing.dummy[-1]), ncomp = 2)

#calculate RMSE
#sqrt(mean((pcr_pred - testing.dummy$action_taken1)^2))


#test.prob.pcr <- pcr_model %>% predict(newx = as.matrix(testing.dummy[-1]))
#test.pred.pcr <- as.numeric(ifelse(pcr_pred > 0.5, 1, 0))
#confusionMatrix(data = as.factor(test.pred.pcr), reference = as.factor(testing.dummy$action_taken1), positive = "1")
#test.pcr.roc <- roc(testing.dummy$action_taken1 ~ as.numeric(pcr_pred), plot = TRUE, print.auc = TRUE)

FAMD

library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
options(ggrepel.max.overlaps = Inf)

# Factor Analysis of Mixed Data: project the predictors (column 1 is the
# response, hence the [, -1]) onto 8 components; the test set is mapped
# into each component space with predict.FAMD.
# Components from the unbalanced training set:
famd_unbal <- FAMD(training[, -1], ncp = 8)

training.famd <- data.frame(famd_unbal$ind$coord)
training.famd$action_taken <- training$action_taken
testing.famd <- data.frame(predict.FAMD(famd_unbal, testing[, -1])$coord)
testing.famd$action_taken <- testing$action_taken
# Components from the SMOTE-balanced training set:
famd_bal <- FAMD(training.bal[, -1], ncp = 8)

training.bal.famd <- data.frame(famd_bal$ind$coord)
training.bal.famd$action_taken <- training.bal$action_taken
testing.bal.famd <- data.frame(predict.FAMD(famd_bal, testing[, -1])$coord)
testing.bal.famd$action_taken <- testing$action_taken

# Logistic regression on the FAMD scores of the unbalanced training set.
full <- glm(action_taken ~ ., data = training.famd,
            family = binomial(link = "logit"))
test.prob <- predict(full, newdata = testing.famd, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred),
                reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   1   2
##          1  59 238
##                                           
##                Accuracy : 0.7967          
##                  95% CI : (0.7466, 0.8407)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.591           
##                                           
##                   Kappa : 0.0129          
##                                           
##  Mcnemar's Test P-Value : 7.496e-13       
##                                           
##             Sensitivity : 0.99167         
##             Specificity : 0.01667         
##          Pos Pred Value : 0.80135         
##          Neg Pred Value : 0.33333         
##              Prevalence : 0.80000         
##          Detection Rate : 0.79333         
##    Detection Prevalence : 0.99000         
##       Balanced Accuracy : 0.50417         
##                                           
##        'Positive' Class : 1               
## 
test.roc <- roc(testing$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Logistic regression on the FAMD scores of the SMOTE-balanced training
# set (reference labels come from the original testing split).
full <- glm(action_taken ~ ., data = training.bal.famd,
            family = binomial(link = "logit"))
test.prob <- predict(full, newdata = testing.bal.famd, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred),
                reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  27  76
##          1  33 164
##                                           
##                Accuracy : 0.6367          
##                  95% CI : (0.5794, 0.6912)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1051          
##                                           
##  Mcnemar's Test P-Value : 5.749e-05       
##                                           
##             Sensitivity : 0.6833          
##             Specificity : 0.4500          
##          Pos Pred Value : 0.8325          
##          Neg Pred Value : 0.2621          
##              Prevalence : 0.8000          
##          Detection Rate : 0.5467          
##    Detection Prevalence : 0.6567          
##       Balanced Accuracy : 0.5667          
##                                           
##        'Positive' Class : 1               
## 
test.roc <- roc(testing$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# class of label/dependent variables should be `factor` -> classification
#training.bal.dummy$action_taken1 <- as.factor(training.bal.dummy$action_taken1)
# Random forest (default 500 trees) on the SMOTE-balanced training set;
# print() shows the OOB error estimate and confusion matrix.
# NOTE(review): proximity = TRUE builds an n x n proximity matrix that is
# never used below — O(n^2) memory/time; confirm it is needed.
# NOTE(review): no set.seed before fitting, so the forest and its OOB
# error are not reproducible.
rf_model <- randomForest(action_taken ~ ., data = training.bal, proximity = TRUE)
print(rf_model)
## 
## Call:
##  randomForest(formula = action_taken ~ ., data = training.bal,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 17.44%
## Confusion matrix:
##     0   1 class.error
## 0 445  91   0.1697761
## 1  96 440   0.1791045
# type = "prob" returns class probabilities; column 2 is P(class "1").
rf_prob <- predict(rf_model, testing, type = "prob")[, 2]

#calculate RMSE
#sqrt(mean((rf_pred - testing.dummy$action_taken1)^2))

# Threshold at 0.5 and evaluate against the held-out labels.
test.pred.rf <- as.numeric(rf_prob > 0.5)
confusionMatrix(data = as.factor(test.pred.rf),
                reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  19  51
##          1  41 189
##                                          
##                Accuracy : 0.6933         
##                  95% CI : (0.6378, 0.745)
##     No Information Rate : 0.8            
##     P-Value [Acc > NIR] : 1.0000         
##                                          
##                   Kappa : 0.098          
##                                          
##  Mcnemar's Test P-Value : 0.3481         
##                                          
##             Sensitivity : 0.7875         
##             Specificity : 0.3167         
##          Pos Pred Value : 0.8217         
##          Neg Pred Value : 0.2714         
##              Prevalence : 0.8000         
##          Detection Rate : 0.6300         
##    Detection Prevalence : 0.7667         
##       Balanced Accuracy : 0.5521         
##                                          
##        'Positive' Class : 1              
## 
test.rf.roc <- roc(testing$action_taken ~ rf_prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# ROC for training set
#rf.roc <- roc(training.bal$action_taken, rf_model$votes[, 2], plot = TRUE, print.auc = TRUE)

# RandomForest based on famd balanced data
# Same forest settings as above, but on the 8 FAMD components of the
# balanced training set. NOTE(review): the proximity matrix is again
# unused, and the fit is unseeded (not reproducible).
rf_model2 <- randomForest(action_taken ~ ., data = training.bal.famd, proximity = TRUE)
print(rf_model2)
## 
## Call:
##  randomForest(formula = action_taken ~ ., data = training.bal.famd,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 23.79%
## Confusion matrix:
##     0   1 class.error
## 0 405 131   0.2444030
## 1 124 412   0.2313433
# P(class "1") on the FAMD-projected test set, thresholded at 0.5;
# reference labels are the original test labels (identical values).
rf_prob <- predict(rf_model2, testing.bal.famd, type = "prob")[, 2]

test.pred.rf <- as.numeric(rf_prob > 0.5)
confusionMatrix(data = as.factor(test.pred.rf),
                reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  21  54
##          1  39 186
##                                           
##                Accuracy : 0.69            
##                  95% CI : (0.6343, 0.7419)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.1143          
##                                           
##  Mcnemar's Test P-Value : 0.1466          
##                                           
##             Sensitivity : 0.7750          
##             Specificity : 0.3500          
##          Pos Pred Value : 0.8267          
##          Neg Pred Value : 0.2800          
##              Prevalence : 0.8000          
##          Detection Rate : 0.6200          
##    Detection Prevalence : 0.7500          
##       Balanced Accuracy : 0.5625          
##                                           
##        'Positive' Class : 1               
## 
test.rf.roc <- roc(testing$action_taken ~ rf_prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

XGBOOST

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Build sparse model matrices (no intercept: `~ . - 1`) and xgb.DMatrix
# objects; factor labels "0"/"1" are converted to numeric 0/1.
testing_label <- testing[, 1] # action_taken is column 1 (see the earlier select())
training_sparse <- sparse.model.matrix(action_taken ~ . - 1, training.bal)
training_label <- training.bal[, 1]
train_matrix <- xgb.DMatrix(data = training_sparse, label = as.numeric(training_label) - 1)
testing_sparse <- sparse.model.matrix(action_taken ~ . - 1, testing)
test_matrix <- xgb.DMatrix(data = testing_sparse, label = as.numeric(testing_label) - 1)

params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.3,
               gamma = 0, max_depth = 6, min_child_weight = 1, subsample = 1,
               colsample_bytree = 1)
# BUG FIX: the CV result used to be assigned to `xgb.cv`, shadowing the
# xgboost::xgb.cv function for the rest of the session.
cv_results <- xgb.cv(params = params, data = train_matrix, nfold = 5, nrounds = 100)
# Best round by mean held-out logloss (answers the dangling
# "best iteration =" note in the recorded output below).
which.min(cv_results$evaluation_log$test_logloss_mean)
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1]  train-logloss:0.575328+0.002984 test-logloss:0.615247+0.012894 
## [2]  train-logloss:0.501930+0.005430 test-logloss:0.569148+0.015696 
## [3]  train-logloss:0.444070+0.006646 test-logloss:0.542786+0.024643 
## [4]  train-logloss:0.403595+0.012812 test-logloss:0.522496+0.019828 
## [5]  train-logloss:0.369982+0.014294 test-logloss:0.511582+0.020290 
## [6]  train-logloss:0.345851+0.011975 test-logloss:0.504394+0.023060 
## [7]  train-logloss:0.324557+0.013860 test-logloss:0.497223+0.027897 
## [8]  train-logloss:0.307368+0.016076 test-logloss:0.486923+0.028678 
## [9]  train-logloss:0.293764+0.016048 test-logloss:0.485366+0.029859 
## [10] train-logloss:0.279038+0.017221 test-logloss:0.484922+0.031905 
## [11] train-logloss:0.263453+0.013164 test-logloss:0.478519+0.034179 
## [12] train-logloss:0.251357+0.016550 test-logloss:0.477193+0.033905 
## [13] train-logloss:0.241299+0.014537 test-logloss:0.472791+0.033833 
## [14] train-logloss:0.230877+0.015272 test-logloss:0.473832+0.037299 
## [15] train-logloss:0.222188+0.016450 test-logloss:0.471588+0.038575 
## [16] train-logloss:0.213416+0.015626 test-logloss:0.469947+0.040920 
## [17] train-logloss:0.203699+0.012803 test-logloss:0.468465+0.045120 
## [18] train-logloss:0.195589+0.013835 test-logloss:0.468523+0.046560 
## [19] train-logloss:0.189764+0.014508 test-logloss:0.468528+0.046030 
## [20] train-logloss:0.180556+0.014581 test-logloss:0.466738+0.046268 
## [21] train-logloss:0.174003+0.014733 test-logloss:0.465854+0.047783 
## [22] train-logloss:0.168908+0.015769 test-logloss:0.463761+0.046592 
## [23] train-logloss:0.163050+0.017333 test-logloss:0.462026+0.046306 
## [24] train-logloss:0.155722+0.017056 test-logloss:0.458481+0.043100 
## [25] train-logloss:0.151106+0.017426 test-logloss:0.458941+0.042036 
## [26] train-logloss:0.146893+0.018231 test-logloss:0.458409+0.045516 
## [27] train-logloss:0.140881+0.016855 test-logloss:0.459703+0.045748 
## [28] train-logloss:0.135411+0.014836 test-logloss:0.455239+0.044243 
## [29] train-logloss:0.132932+0.014751 test-logloss:0.455812+0.044813 
## [30] train-logloss:0.127719+0.013357 test-logloss:0.453189+0.044798 
## [31] train-logloss:0.123196+0.013099 test-logloss:0.450902+0.044518 
## [32] train-logloss:0.119075+0.013502 test-logloss:0.450034+0.045058 
## [33] train-logloss:0.114776+0.012050 test-logloss:0.451793+0.044426 
## [34] train-logloss:0.109532+0.010637 test-logloss:0.450099+0.044999 
## [35] train-logloss:0.105817+0.011119 test-logloss:0.450079+0.043500 
## [36] train-logloss:0.102665+0.011020 test-logloss:0.451424+0.045060 
## [37] train-logloss:0.098994+0.010073 test-logloss:0.449638+0.047872 
## [38] train-logloss:0.096057+0.009571 test-logloss:0.449446+0.047866 
## [39] train-logloss:0.093768+0.010869 test-logloss:0.450172+0.049167 
## [40] train-logloss:0.090273+0.009789 test-logloss:0.451410+0.049300 
## [41] train-logloss:0.087471+0.009941 test-logloss:0.451510+0.051148 
## [42] train-logloss:0.084873+0.010605 test-logloss:0.451232+0.048054 
## [43] train-logloss:0.082608+0.010582 test-logloss:0.451934+0.048905 
## [44] train-logloss:0.080523+0.009924 test-logloss:0.452319+0.048454 
## [45] train-logloss:0.078267+0.009568 test-logloss:0.455123+0.047856 
## [46] train-logloss:0.076320+0.008972 test-logloss:0.454527+0.046549 
## [47] train-logloss:0.073697+0.008706 test-logloss:0.454017+0.047504 
## [48] train-logloss:0.071221+0.007831 test-logloss:0.452042+0.048680 
## [49] train-logloss:0.069412+0.008170 test-logloss:0.450622+0.048321 
## [50] train-logloss:0.067979+0.008015 test-logloss:0.450975+0.048672 
## [51] train-logloss:0.066185+0.008269 test-logloss:0.452030+0.047807 
## [52] train-logloss:0.064518+0.008413 test-logloss:0.453741+0.048108 
## [53] train-logloss:0.062381+0.007959 test-logloss:0.452425+0.049620 
## [54] train-logloss:0.061182+0.007974 test-logloss:0.453394+0.049104 
## [55] train-logloss:0.059464+0.007872 test-logloss:0.453838+0.050617 
## [56] train-logloss:0.057926+0.007693 test-logloss:0.454876+0.050739 
## [57] train-logloss:0.056849+0.007496 test-logloss:0.454179+0.051505 
## [58] train-logloss:0.055563+0.007523 test-logloss:0.454796+0.050938 
## [59] train-logloss:0.054687+0.007520 test-logloss:0.455452+0.052946 
## [60] train-logloss:0.053337+0.007451 test-logloss:0.455313+0.054009 
## [61] train-logloss:0.052165+0.007149 test-logloss:0.454925+0.054293 
## [62] train-logloss:0.050944+0.006948 test-logloss:0.456024+0.054014 
## [63] train-logloss:0.049851+0.006989 test-logloss:0.457142+0.053746 
## [64] train-logloss:0.048811+0.006789 test-logloss:0.456656+0.054508 
## [65] train-logloss:0.047653+0.006435 test-logloss:0.456543+0.054898 
## [66] train-logloss:0.046775+0.006389 test-logloss:0.458538+0.055702 
## [67] train-logloss:0.045698+0.006298 test-logloss:0.459710+0.055899 
## [68] train-logloss:0.044676+0.006100 test-logloss:0.460719+0.056485 
## [69] train-logloss:0.043880+0.006006 test-logloss:0.461375+0.058295 
## [70] train-logloss:0.043038+0.005914 test-logloss:0.463882+0.060501 
## [71] train-logloss:0.042217+0.005677 test-logloss:0.464632+0.060130 
## [72] train-logloss:0.041275+0.005421 test-logloss:0.465915+0.060754 
## [73] train-logloss:0.040403+0.005369 test-logloss:0.465292+0.060819 
## [74] train-logloss:0.039618+0.005264 test-logloss:0.465989+0.060903 
## [75] train-logloss:0.038918+0.005151 test-logloss:0.465025+0.060670 
## [76] train-logloss:0.038189+0.005027 test-logloss:0.465463+0.061083 
## [77] train-logloss:0.037413+0.004810 test-logloss:0.467112+0.061760 
## [78] train-logloss:0.036660+0.004687 test-logloss:0.468336+0.060842 
## [79] train-logloss:0.035989+0.004428 test-logloss:0.468957+0.061755 
## [80] train-logloss:0.035332+0.004312 test-logloss:0.468916+0.062188 
## [81] train-logloss:0.034837+0.004255 test-logloss:0.470496+0.061811 
## [82] train-logloss:0.034178+0.004081 test-logloss:0.469951+0.062935 
## [83] train-logloss:0.033736+0.004036 test-logloss:0.469360+0.061852 
## [84] train-logloss:0.033133+0.003929 test-logloss:0.469082+0.062120 
## [85] train-logloss:0.032577+0.003808 test-logloss:0.469844+0.063032 
## [86] train-logloss:0.032004+0.003683 test-logloss:0.470098+0.063230 
## [87] train-logloss:0.031646+0.003724 test-logloss:0.471558+0.063476 
## [88] train-logloss:0.031281+0.003732 test-logloss:0.472890+0.063665 
## [89] train-logloss:0.030859+0.003643 test-logloss:0.473153+0.063413 
## [90] train-logloss:0.030441+0.003599 test-logloss:0.472045+0.063890 
## [91] train-logloss:0.030058+0.003507 test-logloss:0.473470+0.063892 
## [92] train-logloss:0.029588+0.003398 test-logloss:0.473202+0.063907 
## [93] train-logloss:0.029167+0.003299 test-logloss:0.474970+0.063640 
## [94] train-logloss:0.028801+0.003199 test-logloss:0.475443+0.064750 
## [95] train-logloss:0.028409+0.003156 test-logloss:0.476730+0.065356 
## [96] train-logloss:0.028040+0.003127 test-logloss:0.476242+0.065973 
## [97] train-logloss:0.027681+0.003067 test-logloss:0.476400+0.065921 
## [98] train-logloss:0.027268+0.002946 test-logloss:0.476300+0.066208 
## [99] train-logloss:0.026913+0.002930 test-logloss:0.476575+0.067105 
## [100]    train-logloss:0.026521+0.002826 test-logloss:0.477062+0.067162
## best iteration = 38 (minimum mean test-logloss 0.449446 in the CV log above)

xgb1 <- xgb.train(params = params, data = train_matrix, nrounds = 100, watchlist = list(val = test_matrix, train=train_matrix), print_every_n = 10, early_stop_round = 10, maximize = F , eval_metric = "error")
## [23:01:47] WARNING: amalgamation/../src/learner.cc:576: 
## Parameters: { "early_stop_round" } might not be used.
## 
##   This could be a false alarm, with some parameters getting used by language bindings but
##   then being mistakenly passed down to XGBoost core, or some parameter actually being used
##   but getting flagged wrongly here. Please open an issue if you find any such cases.
## 
## 
## [1]  val-error:0.353333  train-error:0.175373 
## [11] val-error:0.313333  train-error:0.061567 
## [21] val-error:0.300000  train-error:0.021455 
## [31] val-error:0.303333  train-error:0.009328 
## [41] val-error:0.310000  train-error:0.001866 
## [51] val-error:0.303333  train-error:0.000000 
## [61] val-error:0.303333  train-error:0.000000 
## [71] val-error:0.310000  train-error:0.000000 
## [81] val-error:0.303333  train-error:0.000000 
## [91] val-error:0.296667  train-error:0.000000 
## [100]    val-error:0.290000  train-error:0.000000
# Score the held-out set with the fitted booster and summarise performance.
xgb.prob <- predict(xgb1, test_matrix)   # predicted probability of class 1
xgb.pred <- as.numeric(xgb.prob > 0.5)   # hard 0/1 labels at the 0.5 cutoff
confusionMatrix(data = as.factor(xgb.pred),
                reference = testing_label,
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  21  48
##          1  39 192
##                                           
##                Accuracy : 0.71            
##                  95% CI : (0.6551, 0.7607)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.142           
##                                           
##  Mcnemar's Test P-Value : 0.3911          
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.3500          
##          Pos Pred Value : 0.8312          
##          Neg Pred Value : 0.3043          
##              Prevalence : 0.8000          
##          Detection Rate : 0.6400          
##    Detection Prevalence : 0.7700          
##       Balanced Accuracy : 0.5750          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve (with AUC annotation) for the XGBoost probabilities.
test.rf.roc <- roc(response = testing$action_taken, predictor = xgb.prob,
                   plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Feature-importance table for the fitted booster, then plot it.
mat1 <- xgb.importance(feature_names = colnames(training_sparse),
                       model = xgb1)
xgb.plot.importance(mat1)

# xgboost based on famd balanced training data

# Build sparse design matrices and xgb.DMatrix objects for the model fit
# on the FAMD-transformed, class-balanced data (labels encoded 0/1 via
# `as.numeric(factor) - 1`).
# NOTE(review): features come from training.bal.famd but the label from
# training.bal -- this assumes both keep identical row order; confirm.
training_sparse <- sparse.model.matrix(action_taken ~ . - 1, training.bal.famd)
training_label <- training.bal[, 1]
train_matrix <- xgb.DMatrix(data = training_sparse, label = as.numeric(training_label) - 1)
# NOTE(review): test features come from testing.bal.famd but the label
# from `testing` (not testing.bal) -- verify this pairing is intentional.
testing_sparse <- sparse.model.matrix(action_taken ~ . - 1, testing.bal.famd)
testing_label <- testing[, 1]
test_matrix <- xgb.DMatrix(data = testing_sparse, label = as.numeric(testing_label) - 1)

# Booster hyper-parameters (these are the xgboost defaults for a
# binary:logistic objective).
params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.3,
               gamma = 0, max_depth = 6, min_child_weight = 1,
               subsample = 1, colsample_bytree = 1)
# Fix: the original assigned the result to `xgb.cv`, shadowing the
# xgboost::xgb.cv() function itself.  Use a distinct result name and call
# the function through `::` so this line keeps working even if the name
# was already shadowed earlier in the script.
cv_famd_bal <- xgboost::xgb.cv(params = params, data = train_matrix,
                               nfold = 5, nrounds = 100)
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1]  train-logloss:0.585364+0.005323 test-logloss:0.641555+0.012803 
## [2]  train-logloss:0.510910+0.004971 test-logloss:0.610823+0.022065 
## [3]  train-logloss:0.456689+0.008054 test-logloss:0.592870+0.031595 
## [4]  train-logloss:0.416464+0.007748 test-logloss:0.583032+0.037100 
## [5]  train-logloss:0.385563+0.010093 test-logloss:0.579411+0.042217 
## [6]  train-logloss:0.363716+0.010632 test-logloss:0.574218+0.045401 
## [7]  train-logloss:0.343054+0.012408 test-logloss:0.573916+0.044699 
## [8]  train-logloss:0.321494+0.010599 test-logloss:0.568858+0.043291 
## [9]  train-logloss:0.308004+0.013985 test-logloss:0.567986+0.042024 
## [10] train-logloss:0.294271+0.010289 test-logloss:0.568300+0.046385 
## [11] train-logloss:0.280777+0.014025 test-logloss:0.565394+0.046174 
## [12] train-logloss:0.265999+0.014082 test-logloss:0.566940+0.048169 
## [13] train-logloss:0.252873+0.013566 test-logloss:0.570085+0.052022 
## [14] train-logloss:0.241129+0.011083 test-logloss:0.570801+0.055336 
## [15] train-logloss:0.230598+0.008013 test-logloss:0.567980+0.057385 
## [16] train-logloss:0.223197+0.009210 test-logloss:0.565542+0.057120 
## [17] train-logloss:0.213887+0.008812 test-logloss:0.565326+0.059599 
## [18] train-logloss:0.206797+0.009409 test-logloss:0.565711+0.058902 
## [19] train-logloss:0.197965+0.010242 test-logloss:0.563028+0.061834 
## [20] train-logloss:0.188649+0.008737 test-logloss:0.565524+0.062479 
## [21] train-logloss:0.183562+0.009714 test-logloss:0.569271+0.063995 
## [22] train-logloss:0.176623+0.010398 test-logloss:0.570358+0.062927 
## [23] train-logloss:0.170337+0.010265 test-logloss:0.571165+0.064548 
## [24] train-logloss:0.163256+0.007652 test-logloss:0.571667+0.063890 
## [25] train-logloss:0.159203+0.006868 test-logloss:0.570952+0.064961 
## [26] train-logloss:0.155216+0.006977 test-logloss:0.572274+0.064570 
## [27] train-logloss:0.148178+0.006927 test-logloss:0.572924+0.062525 
## [28] train-logloss:0.143701+0.006058 test-logloss:0.574565+0.062850 
## [29] train-logloss:0.138450+0.006267 test-logloss:0.574484+0.062575 
## [30] train-logloss:0.134303+0.007769 test-logloss:0.576855+0.064248 
## [31] train-logloss:0.130123+0.007243 test-logloss:0.577763+0.066619 
## [32] train-logloss:0.125050+0.007921 test-logloss:0.578214+0.066727 
## [33] train-logloss:0.119356+0.007601 test-logloss:0.576790+0.068997 
## [34] train-logloss:0.114638+0.006381 test-logloss:0.578047+0.067789 
## [35] train-logloss:0.110893+0.006411 test-logloss:0.579404+0.067475 
## [36] train-logloss:0.107472+0.006687 test-logloss:0.581695+0.067379 
## [37] train-logloss:0.103749+0.006775 test-logloss:0.583742+0.068978 
## [38] train-logloss:0.100555+0.006849 test-logloss:0.582975+0.070291 
## [39] train-logloss:0.097605+0.007034 test-logloss:0.583663+0.071660 
## [40] train-logloss:0.094639+0.007080 test-logloss:0.584720+0.072417 
## [41] train-logloss:0.092086+0.006369 test-logloss:0.584141+0.073535 
## [42] train-logloss:0.089178+0.006150 test-logloss:0.585927+0.073351 
## [43] train-logloss:0.086956+0.006941 test-logloss:0.588143+0.073289 
## [44] train-logloss:0.084385+0.006469 test-logloss:0.588597+0.074288 
## [45] train-logloss:0.082045+0.006278 test-logloss:0.590760+0.076168 
## [46] train-logloss:0.080451+0.006298 test-logloss:0.591831+0.076982 
## [47] train-logloss:0.078084+0.005693 test-logloss:0.593621+0.078230 
## [48] train-logloss:0.076211+0.005531 test-logloss:0.595418+0.080243 
## [49] train-logloss:0.074120+0.005305 test-logloss:0.595750+0.080684 
## [50] train-logloss:0.072236+0.005353 test-logloss:0.596068+0.082646 
## [51] train-logloss:0.070609+0.005514 test-logloss:0.597855+0.083252 
## [52] train-logloss:0.068738+0.005227 test-logloss:0.601925+0.082556 
## [53] train-logloss:0.067056+0.004848 test-logloss:0.602203+0.083003 
## [54] train-logloss:0.065508+0.004865 test-logloss:0.602596+0.083258 
## [55] train-logloss:0.063658+0.004763 test-logloss:0.605338+0.083968 
## [56] train-logloss:0.061651+0.004266 test-logloss:0.607431+0.081741 
## [57] train-logloss:0.060358+0.004081 test-logloss:0.609811+0.082161 
## [58] train-logloss:0.059187+0.003809 test-logloss:0.611453+0.082976 
## [59] train-logloss:0.057923+0.003669 test-logloss:0.612158+0.082891 
## [60] train-logloss:0.056807+0.003516 test-logloss:0.615115+0.084260 
## [61] train-logloss:0.055713+0.003565 test-logloss:0.615966+0.085714 
## [62] train-logloss:0.054515+0.003650 test-logloss:0.618053+0.086857 
## [63] train-logloss:0.053397+0.003696 test-logloss:0.618726+0.086661 
## [64] train-logloss:0.052119+0.003733 test-logloss:0.620697+0.088931 
## [65] train-logloss:0.051176+0.003463 test-logloss:0.621296+0.088552 
## [66] train-logloss:0.050049+0.003325 test-logloss:0.622687+0.089402 
## [67] train-logloss:0.048811+0.003066 test-logloss:0.624679+0.090095 
## [68] train-logloss:0.047778+0.002979 test-logloss:0.626513+0.091374 
## [69] train-logloss:0.046798+0.002911 test-logloss:0.628137+0.091883 
## [70] train-logloss:0.045988+0.002848 test-logloss:0.629469+0.091519 
## [71] train-logloss:0.045283+0.002717 test-logloss:0.631048+0.092421 
## [72] train-logloss:0.044272+0.002583 test-logloss:0.634337+0.092077 
## [73] train-logloss:0.043631+0.002504 test-logloss:0.634253+0.092245 
## [74] train-logloss:0.042993+0.002364 test-logloss:0.633787+0.092208 
## [75] train-logloss:0.042196+0.002478 test-logloss:0.635289+0.091771 
## [76] train-logloss:0.041289+0.002391 test-logloss:0.636420+0.091375 
## [77] train-logloss:0.040616+0.002268 test-logloss:0.634997+0.091845 
## [78] train-logloss:0.039842+0.002260 test-logloss:0.636639+0.092253 
## [79] train-logloss:0.039181+0.002205 test-logloss:0.636972+0.093622 
## [80] train-logloss:0.038519+0.002150 test-logloss:0.640835+0.092698 
## [81] train-logloss:0.037860+0.002151 test-logloss:0.642254+0.093535 
## [82] train-logloss:0.037427+0.002146 test-logloss:0.643002+0.094331 
## [83] train-logloss:0.036773+0.002067 test-logloss:0.643898+0.094224 
## [84] train-logloss:0.036281+0.002178 test-logloss:0.645239+0.095216 
## [85] train-logloss:0.035751+0.002111 test-logloss:0.645972+0.094806 
## [86] train-logloss:0.035313+0.002025 test-logloss:0.646102+0.095244 
## [87] train-logloss:0.034747+0.002061 test-logloss:0.647250+0.095034 
## [88] train-logloss:0.034342+0.002093 test-logloss:0.647530+0.095802 
## [89] train-logloss:0.033796+0.001991 test-logloss:0.647775+0.094590 
## [90] train-logloss:0.033463+0.002076 test-logloss:0.649119+0.094242 
## [91] train-logloss:0.033007+0.002044 test-logloss:0.648761+0.093406 
## [92] train-logloss:0.032470+0.001980 test-logloss:0.649214+0.093089 
## [93] train-logloss:0.032085+0.001945 test-logloss:0.650466+0.095023 
## [94] train-logloss:0.031636+0.001789 test-logloss:0.650337+0.096600 
## [95] train-logloss:0.031314+0.001873 test-logloss:0.651095+0.095395 
## [96] train-logloss:0.030904+0.001804 test-logloss:0.652770+0.097038 
## [97] train-logloss:0.030469+0.001789 test-logloss:0.653858+0.097896 
## [98] train-logloss:0.030078+0.001741 test-logloss:0.655069+0.097545 
## [99] train-logloss:0.029673+0.001658 test-logloss:0.657145+0.097130 
## [100]    train-logloss:0.029300+0.001659 test-logloss:0.658557+0.097644
## best iteration =

# Fit the XGBoost classifier on the balanced FAMD training matrix.
# Fix: same defect as the first model -- `early_stop_round` is not a
# recognised xgb.train argument (see the warning below), so early
# stopping never ran.  Use `early_stopping_rounds`, with `val` last in
# the watchlist so stopping monitors validation error.
xgb2 <- xgb.train(params = params, data = train_matrix, nrounds = 100,
                  watchlist = list(train = train_matrix, val = test_matrix),
                  print_every_n = 10, early_stopping_rounds = 10,
                  maximize = FALSE, eval_metric = "error")
## [23:01:48] WARNING: amalgamation/../src/learner.cc:576: 
## Parameters: { "early_stop_round" } might not be used.
## 
##   This could be a false alarm, with some parameters getting used by language bindings but
##   then being mistakenly passed down to XGBoost core, or some parameter actually being used
##   but getting flagged wrongly here. Please open an issue if you find any such cases.
## 
## 
## [1]  val-error:0.340000  train-error:0.197761 
## [11] val-error:0.360000  train-error:0.076493 
## [21] val-error:0.360000  train-error:0.033582 
## [31] val-error:0.380000  train-error:0.006530 
## [41] val-error:0.366667  train-error:0.000000 
## [51] val-error:0.363333  train-error:0.000000 
## [61] val-error:0.376667  train-error:0.000000 
## [71] val-error:0.376667  train-error:0.000000 
## [81] val-error:0.383333  train-error:0.000000 
## [91] val-error:0.373333  train-error:0.000000 
## [100]    val-error:0.370000  train-error:0.000000
# Score the held-out set with the balanced-data booster and summarise.
xgb.prob <- predict(xgb2, test_matrix)   # predicted probability of class 1
xgb.pred <- as.numeric(xgb.prob > 0.5)   # hard 0/1 labels at the 0.5 cutoff
confusionMatrix(data = as.factor(xgb.pred),
                reference = testing_label,
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  22  73
##          1  38 167
##                                           
##                Accuracy : 0.63            
##                  95% CI : (0.5726, 0.6848)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.0513          
##                                           
##  Mcnemar's Test P-Value : 0.00125         
##                                           
##             Sensitivity : 0.6958          
##             Specificity : 0.3667          
##          Pos Pred Value : 0.8146          
##          Neg Pred Value : 0.2316          
##              Prevalence : 0.8000          
##          Detection Rate : 0.5567          
##    Detection Prevalence : 0.6833          
##       Balanced Accuracy : 0.5312          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve (with AUC annotation) for the balanced-data XGBoost model.
test.rf.roc <- roc(response = testing$action_taken, predictor = xgb.prob,
                   plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Feature-importance table for the balanced-data booster, then plot it.
mat2 <- xgb.importance(feature_names = colnames(training_sparse),
                       model = xgb2)
xgb.plot.importance(mat2)

SVM

library(e1071)
# Tune a linear-kernel SVM over a grid of cost values (e1071::tune uses
# 10-fold cross-validation by default).
# NOTE(review): the printed result below shows "SVM-Type: eps-regression",
# i.e. action_taken1 is numeric here, so e1071 fits a REGRESSION SVM, not
# a classifier.  Confirm that regression followed by 0.5 thresholding is
# intended; otherwise convert the response to a factor.
tune.out <- e1071::tune(svm,action_taken1 ~ ., data = training.bal.dummy, kernel = "linear", ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
# extract the best model (lowest cross-validated error over the cost grid)
(bestmod <- tune.out$best.model)
## 
## Call:
## best.tune(method = svm, train.x = action_taken1 ~ ., data = training.bal.dummy, 
##     ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  0.001 
##       gamma:  0.03846154 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  1022
# Refit the SVM at the tuned cost (0.001) on the balanced training set.
# NOTE(review): this refit uses scale = FALSE, but e1071::tune() above
# used the default scale = TRUE -- the refit may not reproduce the tuned
# model's behaviour; confirm this is deliberate.
svmfit = e1071::svm(action_taken1 ~ ., data = training.bal.dummy, kernel = "linear", cost = 0.001, scale = FALSE)

# Continuous (regression-style) predictions on the test design matrix.
probs <- predict(svmfit, testing.dummy)
# Threshold at 0.5 to obtain hard 0/1 class labels.
preds <- as.numeric(ifelse(probs > 0.5, 1, 0))

confusionMatrix(data = as.factor(preds), reference = as.factor(testing.dummy$action_taken1), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  44 133
##          1  16 107
##                                           
##                Accuracy : 0.5033          
##                  95% CI : (0.4453, 0.5613)
##     No Information Rate : 0.8             
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1035          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.4458          
##             Specificity : 0.7333          
##          Pos Pred Value : 0.8699          
##          Neg Pred Value : 0.2486          
##              Prevalence : 0.8000          
##          Detection Rate : 0.3567          
##    Detection Prevalence : 0.4100          
##       Balanced Accuracy : 0.5896          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve (with AUC annotation) for the SVM scores on the test labels.
test.roc <- roc(response = testing.dummy$action_taken1, predictor = probs,
                plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases